pacman::p_load(dplyr,readxl,tidyr,stringi,stringdist,fuzzyjoin,stringr)
# Working directory: folder holding the yearly admission extracts
# (machine-specific absolute path).
setwd("C:/Users/Andrei/Google Drive/Research/20190300 Romania Bac/")

years_adm <- 2017:2004

# Read every yearly file "data_adm_raw<year>.rds" and collapse it to one row
# per (judet, liceu_repartizat_orig, liceu_repartizat) with:
#   n  - number of rows in that group (presumably one row per admitted
#        student; confirm against the raw files)
#   an - the admission year the file belongs to
# The per-year pieces are built as a list and bound once, instead of growing
# a tibble with rbind() inside a loop. The grouping left behind by
# summarize() is dropped explicitly so later steps do not silently run
# group-wise.
data <- bind_rows(lapply(years_adm, function(y) {
  readRDS(paste0("data_adm_raw", y, ".rds")) %>%
    group_by(judet, liceu_repartizat_orig, liceu_repartizat) %>%
    summarize(n = n()) %>%
    ungroup() %>%
    mutate(an = y)
}))

# Drop records whose assigned high school name is empty
data <- data %>% filter(liceu_repartizat != '')
  
  # Attach the town of each admission high school.
  # The column is created empty first; get_town_adm_hs() (defined in the
  # script evaluated below) returns the data with town_hs_adm set.
  data[["town_hs_adm"]] <- NA
  setwd("C:/Users/Andrei/Google Drive/Research/20190300 Romania Bac/SIIIR Codes/")
  # The helper script is parsed explicitly as UTF-8 and then evaluated.
  eval(parse(file = 'get_town_adm_hs.R', encoding = 'UTF-8'))
  data <- get_town_adm_hs(data)
  
  #1.1 Figure out in which county a middle school (plausibly) is.
  #The idea is that the HS entrance exam data only records the county of the high school.
  #We don't know the county of the middle school. But if a middle school sends many students
  # to a HS in one county, it is plausibly in that same county.
  # NOTE(review): nothing in this block computes a county; the comment above may be
  # carried over from a related script -- confirm it still belongs here.
  #
  # school_words: a single regex string (alternatives separated by "|") listing the ways a
  # middle school is designated. It is used further down with grepl() to set the
  # `middle` and `is_school` flags. Some alternatives carry leading/trailing spaces,
  # which are part of the pattern.
  school_words<-c('SCOALA CU CLASELE I-VIII |SCOALA GIMNAZIALA |SCOALA GENERALA CLASELE I-VIII| SCOALA CLASELE I-VIII |SCOALA GENERALA |SCOALA CU CL\\. I-VIII\\. |SCOALA CU CL\\. I-VIII ')
  # hs_words: one long regex alternation of generic school-type tokens. It is passed to
  # gsub() further down to strip those tokens from a school name, leaving the
  # distinctive part of the name for fuzzy matching.
  # Things to keep in mind when editing the list:
  #  * the alternatives have no word boundaries, so short tokens such as "NR", "GRI"
  #    or "FEG" also match inside longer words;
  #  * "CONSTR. CAI FERATE" contains an unescaped dot, which matches any character;
  #  * several alternatives appear more than once (e.g. TEHNIC, SPECIAL, TEHNOLOGIC,
  #    SCOALA); the repeats do not add anything to the pattern;
  #  * the same pattern is applied to both data sources, so any change here affects
  #    both sides of the match.
  hs_words<-paste0("TEHNOLOGIC DE TURISM SI ALIMENTATIE|CU CLASELE|I-XIII|I-VIII|I-X|SCOALACU CLS\\.|CU CL\\.|CL\\.|COLEGIUL TEHNIC|MARMATIA|AL BANATULUI MONTAN|",
                   "PARTICULAR|DE ARTA|DE INDUSTRIE ALIMENTARA|DEFICIENTI DE VEDERE|BILINGV ROMINO-CROAT|",
                   "DE CONSTRUCTII SI PROTECTIA MEDIULUI|PROFESIONALA DE COOPERATIE|",
                   "DE PROTECTIA MEDIULUI |WALDORF|FEG|DE VEST|",
                   "PENTRU EDUCATIE INCLUZIVA|DE ARTE SI MESERII|GRUPUL SCOLAR|",
                   "CONSTR. CAI FERATE|DE CHIMIE INDUSTRIALA|",
                   "CONSTRUCTII MONTAJ|INDUSTRIAL MINIER|MINIER|",
                   "DE COOPERATIE|INDUSTRIE USOARA|ECONOMIC-ADMINISTRATIV|",
                   "MESERII SI SERVICII|INDUSTRIE MICA SI SERVICII|NATIONALA DE GAZ|",
                   "DE MUZICA SI ARTE PLASTICE|CU PROGRAM DE ATLETISM|DE MUZICA|",
                   "ADMINISTRATIV SI DE SERVICII|ALIMENTATIE PUBLICA|CU PROGRAM SPORTIV|",
                   "FORESTIER|INDUSTRIA STICLEI|COOPERATIST|COOPERATIE|",
                   "DE MARINA|DE AGRICULTURA SI ECONOMIE|AGRICULTURA SI INDUSTRIE ALIMENTARA|PENTRU|AGRICULTURA|INDUSTRIE ALIMENTARA|CONSTRUCTII CAI FERATE|",
                   "DE TRANSPORTURI AUTO|BILINGV ROMANO-CROAT|TEHNIC DE TRANSPORTURI|",
                   "SPECIAL PENTRU DEFICIENTI DE AUZ|ADVENTIST|DEFICIENTI DE VEDERE|TEHNIC|DE MATERIAL RULANT|TRANSPORTURI FEROVIARE|DEFICIENTI DE AUZ|SPECIAL|",
                   "TRANSPORTURI CAI FERATE|VOCATIONAL DE ARTA|ROMANO-CATOLIC|SPECIAL|AUTO |",
                   "SILVIC|TEHNIC|CONSTRUCTII DE MASINI|METALURGIC|PETROL|",
                   "BANATEAN|CARASAN|ALIMENTARA|PENTICOSTAL|BAPTIST|AGRICOL|DE AFACERI|",
                   "REFORMAT|TEOLOGIC|ORTODOX|TEHNOLOGIC FORESTIER|TEHNOLOGIC CONSTRUCTII DE MASINI|",
                   "SPORTIV|SANITAR|ENERGETIC|TEHNOLOGIC|PEDAGOGIC|TEORETIC|",
                   "DE INFORMATICA|ECONOMIC|TRANSPORTURI|ADMINISTRATIV|DE TURISM|PENTRU TRANSPORT RUTIER|",
                   "MAGHIARA|TELECOMUNICATII|DE ARTE|CONSTRUCTII|SERVICII|AUTOMECANICA|NATIONAL|",
                   "FEROVIAR|TEXTIL|GR\\. SC\\.|CENTRUL SCOLAR|GRUP SCOLAR INDUSTRIAL|",
                   "THE CAMBRIDGE INTERNATIONAL SCHOOL IN|DIN |GRI|GRUP SCOLAR|GRUPUL SCOLAR INDUSTRIAL|GRUPUL SCOLAR|LICEUL|",
                   "N[0-9]|NR [0-9]|NR\\. [0-9]|NR\\.[0-9]|NR\\.|NR|NUMARUL [0-9]|SCOALA|DE CONSTRUCTII-MONTAJ|COMPLEXUL SCOLAR|",
                   "SCOLAR|MONAHAL|DE COOPERATIE|MILITAR|AGROMONTAN|DE ECOLOGIE SI PROTECTIA MEDIULUI|NAVAL|PROTECTIA MEDIULUI|INDUSTRIA STICLEI|INDUSTRIE MICA|SERVICII|SI SERVICII|",
                   "CENTRU DE STUDII|CENTRUL DE STUDII|DE CHIMIE|COM\\.|MUN\\.|MUNICIPIUL|ORAS|AUTO|DE ECOLOGIE SI PROT\\. MED\\.|TEHNOLOGIC|",
                   "GRUPUL SCOLARDE ECOLOGIE SI PROT\\. MED\\.|DE AGROTURISM|POSTA|AGROTURISM|DE POSTA SI TELECOMUNICATII| DE CONSTRUCTII|\\(|\\)|",
                   "COLEGIUL|SEMINARUL|SI PROTECTIA MEDIULUI|AUTOMATIZARI|ELECTRONICA|DE ELECTRONICA SI AUTOMATIZARI|UCECOM|ECOLOGIC|",
                   "DE COMUNICATII|GRECO-CATOLIC|AGROINDUSTRIAL|AGROMONTAN|SCOALA|",
                   "ORASUL|ENERGETIC|CLASELE|I-VIII|COMUNA|TEHNOLOGIC|MESERII|BILINGV|GRUPUL|",
                   "PENTRU|SC\\. GEN\\.|LICEAL|LOCALITATEA|GERMAN|LICEUL")
  
  #do some high school-specific matching
  # One regex alternation per school type. Each pattern is applied with grepl()
  # to the raw (upper-case) school name further down to create a logical flag
  # column of the same name.
  technical<-"(TEHNIC)|(TEHNOLOGIC)|(INDUSTRIAL)|(AGRICOL)|(SILVIC)|(UCECOM)|(PROFESIONALA)|(ECONOMIC)|(GRUP SCOLAR)|(GRUPUL SCOLAR)|(GR\\. SC\\.)|(CONSTRUCTII)"
  theory<-"(TEORETIC)|(PEDAGOGIC)|(COLEGIUL NATIONAL)"
  # "ADVENTIST" was previously misspelled "ADEVNTIST", so Adventist schools were
  # never flagged as theological.
  theology<-"(MONAHAL)|(TEOLOGIC)|(SEMINAR)|(ORTODOX)|(REFORMAT)|(CATOLIC)|(GRECO)|(CRESTIN)|(PENTICOSTAL)|(ADVENTIST)|(BAPTIST)"
  # "(SPORT)" already covers "(SPORTIV)"; both are kept as written.
  sports<-"(SPORTIV)|(ATLETISM)|(SPORT)"
  private<-"(PARTICULAR)|(PRIVAT)"
  special<-"(SPECIAL)|(INCLUZIV)|(DEFICIENT)"
  # The leading space in "( DANS)" is part of the pattern.
  arts<-"( DANS)|(COREGRAFIE)|(MUZICA)|(DE ARTA)|(DE ARTE)|(PLASTICE)|(PLASTICA)|(ARTISTIC)"
  language<-"(CROAT)|(BILINGV)|(MAGHIAR)|(GERMAN)"
  # `school` is a second name for the middle-school pattern.
  school <- school_words

  # The type patterns collected in one list, in this fixed order.
  # The `language` and `school` patterns are not part of the list.
  types_hs <- list(
    technical, theory, theology, sports,
    private, special, arts
  )
  
  ########################################### LOAD DATA
  ######## STUDENT DATA
  # Build `hs`: the admission records plus several normalised variants of the
  # school name (used as fuzzy-join keys later) and logical school-type flags.
  # The same sequence of transformations is applied to the government data, so
  # the two sides stay comparable.
  hs <- data %>%
    # filter(judet.ms==county) %>%
    mutate(
      # digits that directly follow an "N", an "R" or a space
      school_number = str_extract(liceu_repartizat, '(?<=[NR ])[0-9]+'),

      # name with the generic school-type tokens removed, then dots, commas and
      # remaining punctuation dropped and whitespace collapsed
      school_name_stripped = gsub(hs_words, "", liceu_repartizat),
      school_name_stripped = trimws(gsub('\\.', '', school_name_stripped, perl = TRUE)),
      school_name_stripped = gsub(',', '', school_name_stripped, perl = TRUE),
      school_name_stripped = ifelse(is.na(school_name_stripped), "", school_name_stripped),
      school_name_stripped = gsub("[[:punct:]]", "", school_name_stripped),
      school_name_stripped = trimws(gsub("\\s+", " ", school_name_stripped, perl = TRUE))
    ) %>%
    # The town differs per row and is used as the gsub() pattern, hence rowwise.
    # The town name is interpreted as a regular expression here.
    rowwise() %>%
    mutate(
      school_name_stripped_no_town = trimws(gsub(town_hs_adm, "", school_name_stripped)),
      school_name_no_town = trimws(gsub(town_hs_adm, "", liceu_repartizat))
    ) %>%
    ungroup() %>%
    mutate(
      # text between double quotes. stri_extract_all_regex() returns a list
      # column; the gsub() calls below coerce it with as.character().
      # NOTE(review): a name with more than one quoted segment is therefore
      # turned into a deparsed c(...) string before punctuation is removed --
      # confirm this is intended.
      school_name_quote = stri_extract_all_regex(liceu_repartizat, '(?<=").*?(?=")'),
      school_name_quote = trimws(gsub('\\.', '', school_name_quote, perl = TRUE)),
      school_name_quote = gsub(',', '', school_name_quote, perl = TRUE),
      school_name_quote = ifelse(is.na(school_name_quote), "", school_name_quote),
      school_name_quote = gsub("[[:punct:]]", "", school_name_quote),
      school_name_quote = trimws(gsub("\\s+", " ", school_name_quote, perl = TRUE)),

      # name without its quoted part and without the generic tokens
      school_name_minimal = gsub('\"[^\"]+\"', '', liceu_repartizat, perl = TRUE),
      school_name_minimal = gsub(hs_words, '', school_name_minimal, perl = TRUE),
      school_name_minimal = gsub("[[:punct:]]", "", school_name_minimal),
      school_name_minimal = trimws(gsub("\\s+", " ", school_name_minimal, perl = TRUE)),

      # abbreviated form derived from the quoted name
      school_name_abbrev = gsub('((?<!\\b)[^\\s](?=[a-zA-Z]+[\\s]|[\\s])|\\.)', '', school_name_quote, perl = TRUE),
      school_name_abbrev = gsub("[[:punct:]]", "", school_name_abbrev),
      school_name_abbrev = trimws(gsub("\\s+", " ", school_name_abbrev, perl = TRUE)),

      # full name with punctuation removed and whitespace collapsed
      school_name_punct = gsub("[[:punct:]]", "", liceu_repartizat),
      school_name_punct = trimws(gsub("\\s+", " ", school_name_punct, perl = TRUE)),

      # school-type flags; on the right-hand side each pattern name still refers
      # to the regex variable, because the column of that name is only created
      # by the assignment itself
      technical = grepl(technical, liceu_repartizat),
      theory = grepl(theory, liceu_repartizat),
      theology = grepl(theology, liceu_repartizat),
      sports = grepl(sports, liceu_repartizat),
      private = grepl(private, liceu_repartizat),
      arts = grepl(arts, liceu_repartizat),
      middle = grepl(school_words, liceu_repartizat),
      language = grepl(language, liceu_repartizat),
      is_school = grepl(school_words, liceu_repartizat)
    )
  
    
  setwd("C:/Users/Andrei/Google Drive/Research/20190300 Romania Bac/Unused Data and Future Ressources/Ministerul Educatiei Geocodare/")

  # LOAD GOVERNMENT DATA WITH GPS COORDINATES
  # Keep the first row per (Judet, Cod SIIIR) among the rows whose level is one
  # of the three listed below.
  kept_levels <- c('Liceal','Profesional','Postliceal')
  school_data <- read_excel('schools.xlsx') %>%
    group_by(Judet, `Cod SIIIR`) %>%
    filter(Nivel %in% kept_levels) %>%
    slice(1) %>%
    ungroup()

  # Give the code columns syntactic names (a column that is absent is skipped).
  code_columns <- c("Cod SIIIR" = 'Cod_SIIIR',
                    "Cod SIRUES" = 'Cod_SIRUES',
                    "Cod siruta" = 'Cod_SIRUTA')
  for (old_name in names(code_columns)) {
    names(school_data)[names(school_data) == old_name] <- code_columns[[old_name]]
  }

  # Store the SIIIR code as text; codes shorter than 10 characters get a single
  # "0" prepended.
  school_data <- school_data %>%
    mutate(
      Cod_SIIIR = as.character(Cod_SIIIR),
      Cod_SIIIR = ifelse(nchar(Cod_SIIIR) < 10, paste0("0", Cod_SIIIR), Cod_SIIIR)
    )

  # Load geo data
  geo_data <- read_excel('20170327-coordonategps-scoli.xlsx')

  # Left-merge the coordinates onto the school list by SIIIR code
  gps_data_merged <- base::merge(school_data, geo_data, by = 'Cod_SIIIR', all.x = TRUE)

  # Keep the relevant fields under their .gps names and drop schools that have
  # no latitude
  gps_data_merged <- gps_data_merged %>%
    select(judet.gps = Judet, school.gps = Denumire, town.gps = Localitate,
           lat.gps = LAT, lon.gps = LONG,
           Cod_SIIIR, Cod_SIRUES, Cod_SIRUTA) %>%
    filter(!is.na(lat.gps))
  
  # Clean the government fields of special characters; clean_govt() comes from
  # the sourced script.
  Sys.setlocale(locale="Romanian")
  source('clean_govt.R')

  # Add the same normalised name variants and school-type flags that are
  # computed for the admission data, this time from school.gps / town.gps, so
  # that both sides can be joined on identically derived keys.
  gps_data_merged <- as.data.frame(clean_govt(gps_data_merged)) %>%
    mutate(
      # digits that directly follow an "N", an "R" or a space
      school_number = str_extract(school.gps, '(?<=[NR ])[0-9]+'),

      # name with the generic school-type tokens removed, then dots, commas and
      # remaining punctuation dropped and whitespace collapsed
      school_name_stripped = gsub(hs_words, "", school.gps),
      school_name_stripped = trimws(gsub('\\.', '', school_name_stripped, perl = TRUE)),
      school_name_stripped = gsub(',', '', school_name_stripped, perl = TRUE),
      school_name_stripped = ifelse(is.na(school_name_stripped), "", school_name_stripped),
      school_name_stripped = gsub("[[:punct:]]", "", school_name_stripped),
      school_name_stripped = trimws(gsub("\\s+", " ", school_name_stripped, perl = TRUE))
    ) %>%
    # The town differs per row and is used as the gsub() pattern, hence rowwise.
    # The town name is interpreted as a regular expression here.
    rowwise() %>%
    mutate(
      school_name_stripped_no_town = trimws(gsub(town.gps, "", school_name_stripped)),
      school_name_no_town = trimws(gsub(town.gps, "", school.gps))
    ) %>%
    ungroup() %>%
    mutate(
      # text between double quotes. stri_extract_all_regex() returns a list
      # column; the gsub() calls below coerce it with as.character().
      # NOTE(review): a name with more than one quoted segment is therefore
      # turned into a deparsed c(...) string before punctuation is removed --
      # confirm this is intended.
      school_name_quote = stri_extract_all_regex(school.gps, '(?<=").*?(?=")'),
      school_name_quote = trimws(gsub('\\.', '', school_name_quote, perl = TRUE)),
      school_name_quote = gsub(',', '', school_name_quote, perl = TRUE),
      school_name_quote = ifelse(is.na(school_name_quote), "", school_name_quote),
      school_name_quote = gsub("[[:punct:]]", "", school_name_quote),
      school_name_quote = trimws(gsub("\\s+", " ", school_name_quote, perl = TRUE)),

      # name without its quoted part and without the generic tokens
      school_name_minimal = gsub('\"[^\"]+\"', '', school.gps, perl = TRUE),
      school_name_minimal = gsub(hs_words, '', school_name_minimal, perl = TRUE),
      school_name_minimal = gsub("[[:punct:]]", "", school_name_minimal),
      school_name_minimal = trimws(gsub("\\s+", " ", school_name_minimal, perl = TRUE)),

      # abbreviated form derived from the quoted name
      school_name_abbrev = gsub('((?<!\\b)[^\\s](?=[a-zA-Z]+[\\s]|[\\s])|\\.)', '', school_name_quote, perl = TRUE),
      school_name_abbrev = gsub("[[:punct:]]", "", school_name_abbrev),
      school_name_abbrev = trimws(gsub("\\s+", " ", school_name_abbrev, perl = TRUE)),

      # full name with punctuation removed and whitespace collapsed
      school_name_punct = gsub("[[:punct:]]", "", school.gps),
      school_name_punct = trimws(gsub("\\s+", " ", school_name_punct, perl = TRUE)),

      # school-type flags; on the right-hand side each pattern name still refers
      # to the regex variable, because the column of that name is only created
      # by the assignment itself
      technical = grepl(technical, school.gps),
      theory = grepl(theory, school.gps),
      theology = grepl(theology, school.gps),
      sports = grepl(sports, school.gps),
      private = grepl(private, school.gps),
      arts = grepl(arts, school.gps),
      middle = grepl(school_words, school.gps),
      language = grepl(language, school.gps),
      is_school = grepl(school_words, school.gps)
    )
  
  #remove duplicates; some schools have 2 locations, but these are typically very close to one another
  # Keep the first row per (county, school name). The grouping is dropped
  # afterwards: otherwise gps_data_merged stays grouped by (judet.gps, school.gps)
  # and every later filter() on it is evaluated once per one-row group.
  gps_data_merged<-gps_data_merged %>% 
    group_by(judet.gps,school.gps) %>% 
    slice(1) %>%
    ungroup()
    #%>% filter(judet.gps==county)
  

  #match on name
  #hs<-hs %>% group_by(judet,town_hs_adm) %>% mutate(n=n()) %>% ungroup() %>% filter(liceu_repartizat!='HS NOT MATCHED')

  # One fuzzy-matching stage, run county by county.
  #
  # lhs_all            admission-side rows still to be matched
  # rhs_all            government-side rows (gps_data_merged)
  # counties           counties to iterate over
  # lhs_col, rhs_col   names of the key columns compared by string distance
  # max_dist           maximum string distance handed to the fuzzy join
  # keep_below         only matches with dist strictly below this are kept;
  #                    rows without any match (dist is NA) are dropped as well
  # require_non_empty  if TRUE, rows whose key is "" are excluded on both sides
  #
  # Returns one row per matched liceu_repartizat (the closest candidate within
  # its county), sorted by decreasing distance.
  match_hs_by_county <- function(lhs_all, rhs_all, counties, lhs_col,
                                 rhs_col = lhs_col, max_dist = 2,
                                 keep_below = 4, require_non_empty = TRUE) {
    per_county <- lapply(counties, function(county) {
      lhs <- lhs_all %>% filter(judet == county)
      rhs <- rhs_all %>% filter(judet.gps == county)
      if (require_non_empty) {
        lhs <- lhs %>% filter(.data[[lhs_col]] != "")
        rhs <- rhs %>% filter(.data[[rhs_col]] != "")
      }
      stringdist_left_join(lhs, rhs,
                           by = stats::setNames(rhs_col, lhs_col),
                           distance_col = "dist", max_dist = max_dist) %>%
        group_by(liceu_repartizat) %>%
        arrange(dist) %>%
        slice(1) %>%
        ungroup()
    })
    bind_rows(per_county) %>%
      filter(dist < keep_below) %>%
      arrange(-dist)
  }

  counties <- unique(hs$judet)

  # Stage 1: full admission name against full government name
  hs_matched <- match_hs_by_county(hs, gps_data_merged, counties,
                                   lhs_col = "liceu_repartizat",
                                   rhs_col = "school.gps",
                                   max_dist = 5, require_non_empty = FALSE)
  hs_unmatched <- anti_join(hs, hs_matched, by = c("judet", "liceu_repartizat"))

  # Stages 2-5: progressively different name variants, tried in this order on
  # whatever is still unmatched:
  #   abbreviation of the quoted name, stripped name, stripped name without
  #   town, raw name without town
  for (name_col in c("school_name_abbrev",
                     "school_name_stripped",
                     "school_name_stripped_no_town",
                     "school_name_no_town")) {
    hs_matched_temp <- match_hs_by_county(hs_unmatched, gps_data_merged,
                                          counties, lhs_col = name_col)
    hs_matched <- bind_rows(hs_matched, hs_matched_temp)
    hs_unmatched <- anti_join(hs, hs_matched, by = c("judet", "liceu_repartizat"))
  }
  
  #match in small towns
  # Last resort: pair a still-unmatched school with a government school located
  # in (almost) the same town, regardless of how different the names are.
  # NOTE(review): the outer filter uses `n == 1`. As far as this script shows,
  # `n` is the row count from the initial per-year summarize(), not the number
  # of schools in a town (the statement that would have redefined it is
  # commented out above). Confirm which meaning is intended.
  hs_matched_temp <- lapply(unique(hs$judet), function(county) {
    candidate_rows <- hs_unmatched %>% filter(judet == county & n == 1)

    lapply(unique(candidate_rows$town_hs_adm), function(town) {
      town_lhs <- hs_unmatched %>%
        filter(judet == county & town_hs_adm == town)
      # government schools whose town name is within distance 1 of this town
      town_rhs <- gps_data_merged %>%
        filter(judet.gps == county & stringdist(town.gps, town) < 2)

      # max_dist = 100 effectively accepts every candidate in the town
      joined <- stringdist_left_join(town_lhs, town_rhs,
                                     by = c("liceu_repartizat" = "school.gps"),
                                     distance_col = "dist", max_dist = 100)
      # Guard in case the join returned no distance column (nothing matched).
      if (!"dist" %in% names(joined)) {
        joined$dist <- rep(NA_real_, nrow(joined))
      }

      joined %>%
        group_by(liceu_repartizat) %>%
        # `dist` becomes the town-name distance, as before. The name distance
        # from the join is kept only as a tie-breaker, so that among several
        # schools in the same town the one with the closest name is chosen
        # instead of whichever row the join happened to list first.
        mutate(name_dist = dist,
               dist = stringdist(town.gps, town)) %>%
        arrange(dist, name_dist) %>%
        slice(1) %>%
        select(-name_dist)
    })
  })

  # bind_rows() flattens the nested per-county / per-town lists
  hs_matched_temp <- bind_rows(hs_matched_temp)
  # skip the update when nothing at all came back (empty tibble without columns)
  if (nrow(hs_matched_temp) + ncol(hs_matched_temp) > 0) {
    hs_matched_temp <- hs_matched_temp %>% filter(!is.na(dist)) %>% arrange(-dist)
    hs_matched <- bind_rows(hs_matched, hs_matched_temp)
    hs_unmatched <- anti_join(hs, hs_matched, by = c("judet", "liceu_repartizat"))
  }
  

  # Keep only the crosswalk columns: admission school name, county, SIIIR code
  hs_matched <- hs_matched %>% select(liceu_repartizat, judet, Cod_SIIIR)

  # Share of `n` that is still unmatched, per admission year.
  # The years are taken from the data instead of being written out one line
  # per year, and the result is printed explicitly so it is also shown when
  # the script is run through source().
  report_years <- sort(unique(hs$an))
  unmatched_share <- vapply(report_years, function(yr) {
    sum(hs_unmatched$n[hs_unmatched$an == yr]) / sum(hs$n[hs$an == yr])
  }, numeric(1))
  names(unmatched_share) <- report_years
  print(unmatched_share)

 